This dataset contains 568,455 customer reviews of products. The columns in the table are:
- Id - row identifier
- ProductId - unique identifier for the product
- UserId - unique identifier for the user
- ProfileName - profile name of the user
- HelpfulnessNumerator - number of users who found the review helpful
- HelpfulnessDenominator - number of users who indicated whether they found the review helpful
- Score - rating between 1 and 5
- Time - timestamp for the review
- Summary - brief summary of the review
- Text - text of the review
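Before loading everything, it can help to confirm the schema straight from the SQLite file. A minimal sketch, assuming the database file is named database.sqlite and the table is called Reviews (both as used below):

import sqlite3

con = sqlite3.connect('database.sqlite')
# PRAGMA table_info yields one row per column: (cid, name, type, notnull, dflt_value, pk)
for col in con.execute("PRAGMA table_info(Reviews)"):
    print(col[1])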
In [1]:
import sqlite3
import pandas as pd
con = sqlite3.connect('database.sqlite')
In [2]:
dat = pd.read_sql_query("""SELECT * FROM Reviews""", con)
In [3]:
dat.head(5)
Out[3]:
In [18]:
import matplotlib.pyplot as plt
%matplotlib inline
In [53]:
print 'Unique products: %.2f%% of all reviews' % (100. * len(set(dat['ProductId'])) / len(dat['ProductId']))
In [54]:
print 'Unique customers: %.2f%% of all reviews' % (100. * len(set(dat['UserId'])) / len(dat['UserId']))
In [55]:
dat['Score'].plot.hist()
Out[55]:
In [4]:
def labelData(score):
    # scores of 4 or 5 count as positive; 1-3 (including the neutral 3) as negative
    if score >= 4:
        return 'positive'
    return 'negative'

dat['PScore'] = dat['Score'].map(labelData)
In [5]:
dat[['Score','PScore']].head(5)
Out[5]:
In [6]:
def combineText(row):
    return row['Summary'] + " " + row['Text']

dat['reviews'] = dat.apply(combineText, axis=1)
In [7]:
dat[['reviews','PScore']].head(5)
Out[7]:
In [8]:
dat[['reviews','PScore']].to_pickle('amzreviews')
In [20]:
dat = pd.read_pickle('amzreviews')
In [21]:
dat.head(5)
Out[21]:
In [22]:
# use the first 10,000 reviews to keep the NLTK preprocessing manageable
reviews = dat['reviews'].tolist()[:10000]
In [23]:
reviews[:2]
Out[23]:
In [24]:
len(reviews)
Out[24]:
In [27]:
from nltk.tokenize import word_tokenize
In [28]:
tokenized_reviews = []
for doc in reviews:
    tokenized_reviews.append(word_tokenize(doc))
    if len(tokenized_reviews) % 10000 == 0:
        print len(tokenized_reviews)
In [29]:
tokenized_reviews[:3]
Out[29]:
In [30]:
import re
import string
# regex that matches any single punctuation character
regex = re.compile('[%s]' % re.escape(string.punctuation))
In [31]:
tokenized_reviews_no_punctuation = []
for review in tokenized_reviews:
    new_review = []
    for token in review:
        new_token = regex.sub(u'', token)
        if new_token:
            new_review.append(new_token)
    tokenized_reviews_no_punctuation.append(new_review)
print tokenized_reviews_no_punctuation[:3]
In [33]:
print len(tokenized_reviews_no_punctuation)
In [19]:
from nltk.corpus import stopwords

# cache the stopword list as a set; calling stopwords.words() for every token is very slow
stop_words = set(stopwords.words('english'))
tokenized_reviews_no_stopwords = []
for doc in tokenized_reviews_no_punctuation:
    new_term_vector = []
    for word in doc:
        if word not in stop_words:
            new_term_vector.append(word)
    tokenized_reviews_no_stopwords.append(new_term_vector)
print tokenized_reviews_no_stopwords[:4]
In [20]:
len(tokenized_reviews_no_stopwords)
Out[20]:
In [34]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
preprocessed_docs = []
for doc in tokenized_reviews_no_punctuation:
#for doc in tokenized_reviews_no_stopwords:
    final_doc = []
    for word in doc:
        final_doc.append(porter.stem(word))
    preprocessed_docs.append(final_doc)
print preprocessed_docs[:4]
In [35]:
# to reduce memory
del tokenized_reviews
del tokenized_reviews_no_punctuation
#del tokenized_reviews_no_stopwords
In [36]:
len(preprocessed_docs)
Out[36]:
In [37]:
final_reviews = []
for doc in preprocessed_docs:
    final_reviews.append((" ".join(doc)).lower())
print final_reviews[:2]
In [38]:
print len(final_reviews)
In [39]:
scores = dat['PScore'].tolist()[:10000]
In [40]:
len(scores)
Out[40]:
In [41]:
from collections import Counter
Counter(scores)
Out[41]:
In [42]:
dat = pd.DataFrame({'reviews':final_reviews,'scores':scores})
In [43]:
dat.head(5)
Out[43]:
In [44]:
dat.to_pickle('reviewsAndscores')
In [45]:
dat = pd.read_pickle('reviewsAndscores')
In [46]:
dat.head(5)
Out[46]:
For prediction we train a logistic regression model and two naive Bayes classifiers (multinomial and Bernoulli) on TF-IDF features.
In [47]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
In [48]:
# bag-of-words counts, then TF-IDF weighting
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(dat['reviews'])
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)
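As an aside, scikit-learn's TfidfVectorizer collapses these two steps (token counting followed by TF-IDF weighting) into a single estimator. A minimal equivalent sketch, assuming the same dat['reviews'] column:

from sklearn.feature_extraction.text import TfidfVectorizer

# one estimator instead of CountVectorizer + TfidfTransformer
vectorizer = TfidfVectorizer()
train_tfidf_alt = vectorizer.fit_transform(dat['reviews'])
print(train_tfidf_alt.shape)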
In [49]:
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(train_tfidf, dat['scores'], test_size=0.2, random_state=21)
In [50]:
from sklearn import linear_model

prediction = dict()
# C=1e5 means very weak regularization
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X_train, Y_train)
prediction['Logistic'] = logreg.predict(X_test)
In [51]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train, Y_train)
prediction['Multinomial'] = model.predict(X_test)
In [52]:
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB().fit(X_train, Y_train)
prediction['Bernoulli'] = model.predict(X_test)
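Before comparing the models with ROC curves, a quick accuracy check gives a first impression. A minimal sketch over the predictions collected in the prediction dict above:

from sklearn.metrics import accuracy_score

for name, predicted in prediction.items():
    print('%s accuracy: %.4f' % (name, accuracy_score(Y_test, predicted)))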
In [53]:
%matplotlib inline
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

def to_binary(x):
    # map the string labels to 0/1 for roc_curve
    if x == 'negative':
        return 0
    return 1

vfunc = np.vectorize(to_binary)
color_idx = 0
colors = ['b', 'g', 'y', 'm', 'k']
for model, predicted in prediction.items():
    false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test.map(to_binary), vfunc(predicted))
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.plot(false_positive_rate, true_positive_rate, colors[color_idx], label='%s: AUC %0.2f' % (model, roc_auc))
    color_idx += 1
plt.title('Classifier comparison with ROC')
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
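Note that these curves are computed from hard 0/1 predictions, so each one reduces to a single operating point. Using predicted class probabilities gives a proper curve. A minimal sketch for the logistic regression model, assuming the fitted logreg and the to_binary helper defined above:

from sklearn.metrics import roc_curve, auc

# column of predict_proba corresponding to the 'positive' class
pos_index = list(logreg.classes_).index('positive')
probs = logreg.predict_proba(X_test)[:, pos_index]
fpr, tpr, _ = roc_curve(Y_test.map(to_binary), probs)
print('AUC from probabilities: %.4f' % auc(fpr, tpr))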
In [54]:
print(metrics.classification_report(Y_test, prediction['Logistic'], target_names=["negative", "positive"]))
In [55]:
from sklearn.metrics import confusion_matrix
In [56]:
confusion_matrix(Y_test, prediction['Logistic'])
Out[56]:
In [58]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # confusion_matrix orders labels alphabetically, so use the same order for the ticks
    labels = sorted(set(scores))
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compute confusion matrix
cm = confusion_matrix(Y_test, prediction['Logistic'])
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cm)

# Normalize each row by the true-class total to get per-class rates
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
plt.show()